Group 9 Shahbaz Masih, Yu Nakamura, Shade Oguntoyinbo
Agenda:
import pandas as pd
import geopandas as gpd
import numpy as np
import datetime as dt
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly as plotly
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import seaborn as sns
# Initialise Plotly's offline notebook mode (py is plotly.offline) so figures can be shown inline.
py.init_notebook_mode(connected=True)
# Use matplotlib's 'ggplot' style sheet for the matplotlib/seaborn figures below.
mpl.style.use('ggplot')
Dataset 1: Climate Change: Earth Surface Temperature Data
Source: Kaggle
Dataset 2: Greenhouse Gas Reporting Program (GHGRP)
Source: GHGRP
# Read the "global temperature by State" csv file.
GlobalTempState = pd.read_csv("./GlobalLandTemperaturesByState.csv")
display(GlobalTempState.head())

# Read the "GHG emission data in Canada" csv file.
GHG = pd.read_csv("./PDGES-GHGRP-GHGEmissionsGES-2004-Present.csv", encoding="ISO-8859-1", engine='python')

# Bilingual source header -> short column name. The key order is the column
# order of the trimmed table.
_ghg_column_names = {
    "Reference Year / Année de référence": "YEAR",
    "Facility Name / Nom de l'installation": "FacilityName",
    "Facility City or District or Municipality / Ville ou District ou Municipalité de l'installation": "FacilityCity",
    "Facility Province or Territory / Province ou territoire de l'installation": "State",
    "Latitude": "Latitude",
    "Longitude": "Longitude",
    "Total Emissions (tonnes CO2e) / Émissions totales (tonnes éq. CO2)": "TotalEmission",
    "English Facility NAICS Code Description / Description du code SCIAN de l'installation en anglais": "FacilityCode",
}
GHG = GHG[list(_ghg_column_names)].rename(columns=_ghg_column_names)
display(GHG.head())

# Read the "Global Temperature" csv file.
GLT = pd.read_csv('GlobalTemperatures.csv')
display(GLT.head())

# Read the "Global Land Temperatures By Country" csv file.
LTbyC = pd.read_csv('GlobalLandTemperaturesByCountry.csv')
display(LTbyC.head())

# Read the "country_codes" csv file; country names are whitespace-trimmed and
# 'Russian Federation' is replaced by 'Russia'.
ctry = pd.read_csv('country_codes.csv')
ctry['Country'] = ctry['Country'].str.strip()
ctry = ctry.replace({'Russian Federation': 'Russia'})
display(ctry.head())
# (label printed in the NaN report, column name) pairs shared by the
# country, state and major-city tables.
_nan_report = (
    ("Average Temperature", 'AverageTemperature'),
    ("Average Temperature Uncertainty", 'AverageTemperatureUncertainty'),
)

# ---- Global temperatures: load, inspect, report NaNs, drop them, rename ----
GlobalTemperatures = pd.read_csv("GlobalTemperatures.csv")
display(GlobalTemperatures)
GlobalTemperatures.describe()
# checking NaN values
for _column in ('dt', 'LandAverageTemperature', 'LandMaxTemperature',
                'LandMinTemperature', 'LandAndOceanAverageTemperature'):
    print(f"Number of NaN values for the column {_column} :", GlobalTemperatures[_column].isnull().sum())
# removing rows without a land average temperature
_has_value = GlobalTemperatures['LandAverageTemperature'].notna()
GlobalTemperatures = GlobalTemperatures[_has_value].rename(
    columns={'LandAverageTemperature': 'GlobalAverageTemperature'})
GlobalTemperatures.head()

# ---- Country temperatures ----
GlobalLandTemperaturesByCountry = pd.read_csv("GlobalLandTemperaturesByCountry.csv")
display(GlobalLandTemperaturesByCountry)
GlobalLandTemperaturesByCountry.describe()
# checking NaN values
for _label, _column in _nan_report:
    print(f"Number of NaN values for the column {_label} :", GlobalLandTemperaturesByCountry[_column].isnull().sum())
# removing NaN values
_has_value = GlobalLandTemperaturesByCountry['AverageTemperature'].notna()
GlobalLandTemperaturesByCountry = GlobalLandTemperaturesByCountry[_has_value].rename(
    columns={'AverageTemperature': 'CountryAverageTemperature'})
GlobalLandTemperaturesByCountry.head()

# ---- State temperatures ----
GlobalLandTemperaturesByState = pd.read_csv("GlobalLandTemperaturesByState.csv")
display(GlobalLandTemperaturesByState)
GlobalLandTemperaturesByState.describe()
# checking NaN values
for _label, _column in _nan_report:
    print(f"Number of NaN values for the column {_label} :", GlobalLandTemperaturesByState[_column].isnull().sum())
# removing NaN values
_has_value = GlobalLandTemperaturesByState['AverageTemperature'].notna()
GlobalLandTemperaturesByState = GlobalLandTemperaturesByState[_has_value].rename(
    columns={'AverageTemperature': 'StateAverageTemperature'})
GlobalLandTemperaturesByState.head()

# ---- Major-city temperatures ----
GlobalLandTemperaturesByMajorCity = pd.read_csv("GlobalLandTemperaturesByMajorCity.csv")
display(GlobalLandTemperaturesByMajorCity)
GlobalLandTemperaturesByMajorCity.describe()
# checking NaN values
for _label, _column in _nan_report:
    print(f"Number of NaN values for the column {_label} :", GlobalLandTemperaturesByMajorCity[_column].isnull().sum())
# removing NaN values
_has_value = GlobalLandTemperaturesByMajorCity['AverageTemperature'].notna()
GlobalLandTemperaturesByMajorCity = GlobalLandTemperaturesByMajorCity[_has_value]
GlobalLandTemperaturesByMajorCity.shape
GlobalLandTemperaturesByMajorCity = GlobalLandTemperaturesByMajorCity.rename(
    columns={'AverageTemperature': 'CityAverageTemperature'})
GlobalLandTemperaturesByMajorCity.head()
# Reading the facility-level GHG data.
CanadaGHGData = pd.read_csv("PDGES-GHGRP-GHGEmissionsGES-2004-Present.csv", encoding="ISO-8859-1", engine='python')
display(CanadaGHGData)
CanadaGHGData.describe()

# Grouping the data by reporting year and summing up the total emissions.
_year_col = 'Reference Year / Année de référence'
_emission_col = 'Total Emissions (tonnes CO2e) / Émissions totales (tonnes éq. CO2)'
YearlyCanadaGHGData = CanadaGHGData[[_year_col, _emission_col]]
YearlyCanadaGHGData.head()
_by_year = YearlyCanadaGHGData.groupby([_year_col], as_index=False)
YearlyCanadaGHGData = _by_year[_emission_col].sum()
YearlyCanadaGHGData.head()
# Check each temperature table for outliers with a box plot.
# Fixes relative to the earlier version:
#   * plt.show was referenced but never called; it is now invoked after every
#     plot so each box plot ends up on its own figure.
#   * ax2/ax3/ax4 used to copy ax1's tick labels (the columns of
#     GlobalTemperatures) onto their own axes; each axes now keeps its own
#     labels and only their rotation/size is changed via tick_params.

# Global temperatures
ax1 = sns.boxplot(data=GlobalTemperatures)
ax1.tick_params(axis='x', labelrotation=90, labelsize=12)
ax1.set_ylabel('Global Tempertures', fontsize=16)
plt.title('Global Temperatures Box Plots', fontsize=20)
plt.show()

# Temperatures by country
ax2 = sns.boxplot(data=GlobalLandTemperaturesByCountry)
ax2.tick_params(axis='x', labelrotation=90, labelsize=12)
ax2.set_ylabel('Tempertures', fontsize=16)
plt.title('Global Temperatures By Country Box Plots', fontsize=20)
plt.show()

# Temperatures by state
ax3 = sns.boxplot(data=GlobalLandTemperaturesByState)
ax3.tick_params(axis='x', labelrotation=90, labelsize=12)
ax3.set_ylabel('Tempertures', fontsize=16)
plt.title('Global Temperatures By State Box Plots', fontsize=20)
plt.show()

# Temperatures by major city
ax4 = sns.boxplot(data=GlobalLandTemperaturesByMajorCity)
ax4.tick_params(axis='x', labelrotation=90, labelsize=12)
ax4.set_ylabel('Tempertures', fontsize=16)
plt.title('Global Temperatures By Major City Box Plots', fontsize=20)
plt.show()
# Box plot of the per-facility total emissions.
_total_emissions = CanadaGHGData['Total Emissions (tonnes CO2e) / Émissions totales (tonnes éq. CO2)']
ax5 = sns.boxplot(data=_total_emissions)
ax5.set_ylabel('Total Emissions')
plt.title('Total GHG Emissions Box Plot', fontsize=20)

# Histogram (100 bins, no KDE curve) of the same column.
# NOTE(review): distplot is deprecated in recent seaborn releases - consider
# migrating to histplot once the target seaborn version is confirmed.
sns.distplot(_total_emissions, kde=False, color='blue', bins=100)
plt.title('Total Emissions', fontsize=18)
plt.xlabel('Total Emissions (tonnes CO2e)', fontsize=16)
plt.ylabel('Frequency', fontsize=16)
# IQR score method to remove the outliers: a row is dropped when any of its
# columns lies more than 1.5 * IQR outside the [Q1, Q3] band of that column.
_iqr_columns = ['Reference Year / Année de référence',
                'Total Emissions (tonnes CO2e) / Émissions totales (tonnes éq. CO2)']
GHGData = CanadaGHGData[_iqr_columns]
GHGData.head()
Q1 = GHGData.quantile(0.25)
Q3 = GHGData.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
_too_low = GHGData < (Q1 - 1.5 * IQR)
_too_high = GHGData > (Q3 + 1.5 * IQR)
_is_outlier = (_too_low | _too_high).any(axis=1)
df2 = GHGData[~_is_outlier]
df2.shape

# Box plot after removing outliers
ax6 = sns.boxplot(data=df2[_iqr_columns[1]])
ax6.set_ylabel('Total Emissions')
plt.title('Total GHG Emissions Box Plot After outliers removal', fontsize=20)
# City -> state lookup table.
# Source: https://raw.githubusercontent.com/datasets/world-cities/master/data/world-cities.csv
citiesdata = pd.read_csv("cities.csv")
citiesdata.head()
# 'name' is renamed to City and 'subcountry' to State so the columns line up
# with the temperature tables.
citiesdata = citiesdata.rename(columns={'name': 'City', 'subcountry': 'State'})
citiesdata.head()
citieswithstates = citiesdata[['City', 'State']]
citieswithstates.head()

# City temperatures get their state attached (inner join on the city name).
GlobalLandTemperaturesByMajorCitywithstate = GlobalLandTemperaturesByMajorCity.merge(
    citieswithstates, on='City')
GlobalLandTemperaturesByMajorCitywithstate.head()

# City rows are joined with the state temperatures for the same date.
citiesandstates = pd.merge(
    GlobalLandTemperaturesByMajorCitywithstate,
    GlobalLandTemperaturesByState,
    on=['dt', 'State', 'Country'],
)
citiesandstates.head()

# ... then with the country temperatures ...
citiestatesandcountries = pd.merge(
    citiesandstates, GlobalLandTemperaturesByCountry, on=['dt', 'Country'])
citiestatesandcountries.head()

# ... and finally with the global temperatures.
citiestatescountriesandglobe = pd.merge(
    GlobalTemperatures, citiestatesandcountries, on=['dt'])
citiestatescountriesandglobe.head()
# Checking if there are any null values.
# `null_counts` was deprecated and later removed from DataFrame.info();
# `show_counts` is its replacement.
citiestatescountriesandglobe.info(verbose=True, show_counts=True)

# Calculating delta temperature for city, state, country and globe.
# Rows are ordered by city and date; diff(-1) * (-1) gives, within each city,
# the next row's value minus the current row's value.
citiestatescountriesandglobe = citiestatescountriesandglobe.sort_values(by=['City', 'dt'])
_delta_columns = {
    'GlobalDeltaT': 'GlobalAverageTemperature',
    'StateDeltaT': 'StateAverageTemperature',
    'CountryDeltaT': 'CountryAverageTemperature',
    'CityDeltaT': 'CityAverageTemperature',
}
for _delta_col, _temperature_col in _delta_columns.items():
    citiestatescountriesandglobe[_delta_col] = (
        citiestatescountriesandglobe.groupby('City')[_temperature_col].diff(-1) * (-1)
    )
citiestatescountriesandglobe.head()
# Separating absolute values of temperature for city, state, country and globe.
temperaturedata = citiestatescountriesandglobe[['dt', 'City', 'GlobalAverageTemperature', 'CountryAverageTemperature',
                                                'StateAverageTemperature', 'CityAverageTemperature']]
temperaturedata.head()

# Separating delta temperatures for city, state, country and globe.
temperaturechangedata = citiestatescountriesandglobe[['dt', 'City', 'GlobalDeltaT', 'CountryDeltaT',
                                                      'StateDeltaT', 'CityDeltaT']]
temperaturechangedata.head()

# Chicago temperature data. Each table is filtered with its own City column
# (the delta table used to be filtered with a mask built from the other table).
ChicagoTempData = temperaturedata[temperaturedata['City'] == 'Chicago']
ChicagoTempChangeData = temperaturechangedata[temperaturechangedata['City'] == 'Chicago']

# Names of the cities in the final dataset.
citiestatescountriesandglobe['City'].unique()

# Left: absolute temperatures for Chicago; right: the deltas.
fig, ax = plt.subplots(1, 2, figsize=(13, 7))
ChicagoTempData.plot(x="dt", y=['GlobalAverageTemperature', 'CountryAverageTemperature',
                                'StateAverageTemperature', 'CityAverageTemperature'], ax=ax[0])
ChicagoTempChangeData.plot(x="dt", y=['GlobalDeltaT', 'CountryDeltaT',
                                      'StateDeltaT', 'CityDeltaT'], ax=ax[1])

# Correlation matrices. Only numeric columns are correlated: 'dt' and 'City'
# are not numeric and make DataFrame.corr() fail on current pandas.
# Each heatmap gets its own figure so it is not drawn over the Chicago subplot.
_temperature_corr = temperaturedata.select_dtypes(include='number').corr()
plt.figure()
sns.heatmap(_temperature_corr, annot=True)

_temperature_change_corr = temperaturechangedata.select_dtypes(include='number').corr()
plt.figure()
sns.heatmap(_temperature_change_corr, annot=True)
How has the average land surface temperature changed from 1775 to 2015? Was any dramatic temperature change observed? Can we confirm global warming from this dataset?
# Cleaning, preparing and wrangling data: yearly global land temperature,
# sampled every 5th year, expressed as % change relative to 1775.
GLT['dt'] = pd.to_datetime(GLT['dt'])
yseries = GLT['dt'].dt.year
mseries = GLT['dt'].dt.month
GLT = pd.DataFrame({'Year': yseries, 'Month': mseries, 'LandAvgTemp': GLT['LandAverageTemperature']})
grouped = GLT.groupby(['Year', 'Month'])
# mean() instead of sum(): sum() silently turns a missing month into 0.0,
# which would then be averaged in as a real 0 degree reading.
# (The name GLT_sum is kept; it holds the monthly values for 1775-2015.)
GLT_sum = grouped.mean().loc[1775:2015]
# groupby(level=...).mean() replaces the removed Series.mean(level=...).
yrlyLandTemp = GLT_sum['LandAvgTemp'].groupby(level='Year').mean().reset_index()
# .copy() so adding a column below does not write into a slice of yrlyLandTemp.
yrlyLandTemp_20 = yrlyLandTemp.loc[::5].copy()
# Scalar 1775 baseline (previously a one-element Series used inside a lambda).
TotalTemp1775 = yrlyLandTemp_20.loc[yrlyLandTemp_20.Year == 1775, 'LandAvgTemp'].iloc[0]
yrlyLandTemp_20['% Change since 1775'] = (
    (yrlyLandTemp_20.LandAvgTemp - TotalTemp1775) / TotalTemp1775 * 100
)
display(yrlyLandTemp_20.head())

fig = px.bar(yrlyLandTemp_20, y='% Change since 1775', x='Year', color='% Change since 1775',
             title="Change in Average Global Land Temperature: 1775 - 2015")
fig.show()

# Same series as a line chart.
fig = go.Figure(data=go.Scatter(x=yrlyLandTemp_20['Year'], y=yrlyLandTemp_20['% Change since 1775'],
                                mode='lines+markers', marker=dict(color='red'), line=dict(color='black')))
fig.update_layout(title_text="Change in Average Global Land Temperature: 1775 - 2015")
fig.update_xaxes(title_text="Year", type='category')
fig.update_yaxes(title_text="% Change since 1775")
fig.show()
# Cleaning, merging and grouping files: yearly mean land temperature per
# country, binned into 5-degree classes for an animated choropleth.
LTbyC['dt'] = pd.to_datetime(LTbyC['dt'])
yseries = LTbyC['dt'].dt.year
mseries = LTbyC['dt'].dt.month
LTbyC = pd.DataFrame({'Year': yseries, 'Month': mseries, 'AverageTemperature': LTbyC['AverageTemperature'],
                      'Country': LTbyC['Country']})
LTbyC_merged = pd.merge(LTbyC, ctry, on='Country')
# Yearly mean of the monthly readings. The previous sum()-then-mean() pipeline
# converted missing months to 0.0 before averaging, so years with gaps were
# biased towards 0 and fully missing years were reported as exactly 0.
# NOTE(review): only AverageTemperature is averaged; if country_codes.csv has
# other numeric columns they were previously mixed into the row-wise mean.
grouped = LTbyC_merged.groupby(['Year', 'Code', 'Country'])[['AverageTemperature']].mean()
# Country-years with no reading at all are dropped instead of being plotted.
yearlyLandTemp = grouped['AverageTemperature'].rename('AvgTemp').dropna().reset_index()
# Left-closed 5-degree classes from -22 to 33.
bins = pd.IntervalIndex.from_tuples([(-22, -17), (-17, -12), (-12, -7), (-7, -2), (-2, 3), (3, 8), (8, 13),
                                     (13, 18), (18, 23), (23, 28), (28, 33)], closed='left')
yearlyLandTemp['AvgTemp_bin'] = pd.cut(yearlyLandTemp.AvgTemp, bins, ordered=True)
yrlyLandTemp_sort = yearlyLandTemp.sort_values(by=['AvgTemp_bin', 'Year'], ascending=False)
display(yrlyLandTemp_sort.head())
display(yrlyLandTemp_sort.describe())
# NOTE(review): the title says 1752 - 2012 but no year filter is applied here;
# confirm whether the frames should be restricted to that range.
fig = px.choropleth(yrlyLandTemp_sort, locations='Code', color="AvgTemp_bin", hover_data=['AvgTemp'],
                    hover_name="Country",
                    title='Average Land Temperature by Country: 1752 - 2012', projection='robinson', height=800,
                    animation_frame='Year', color_discrete_sequence=px.colors.sequential.thermal)
fig.update_geos(resolution=50)
fig.show()
# Average Land Temperature in Canada: 1833 - 2012
# Yearly mean for Canada, sampled every 10th year, with % change relative to 1833.
LTbyC = pd.read_csv('GlobalLandTemperaturesByCountry.csv')
display(LTbyC.head())
LTbyC['dt'] = pd.to_datetime(LTbyC['dt'])
yseries = LTbyC['dt'].dt.year
mseries = LTbyC['dt'].dt.month
LTbyC = pd.DataFrame({'Year': yseries, 'Month': mseries, 'AverageTemperature': LTbyC['AverageTemperature'],
                      'Country': LTbyC['Country']})
group = LTbyC.loc[LTbyC.Country == 'Canada'].groupby(['Year', 'Month'])
# Only the temperature column is aggregated (the string Country column cannot
# be averaged), and mean() keeps a missing month as NaN where sum() made it 0.
# (The name LTbyCA_sum is kept; it holds the monthly values for 1833-2012.)
LTbyCA_sum = group[['AverageTemperature']].mean().loc[1833:2012]
# groupby(level=...).mean() replaces the removed Series.mean(level=...).
CALandTemp = (
    LTbyCA_sum['AverageTemperature'].groupby(level='Year').mean().rename('LandAvgTemp').reset_index()
)
# .copy() so the new column is not written into a slice of CALandTemp.
CALandTemp_20 = CALandTemp.loc[::10].copy()
# Scalar 1833 baseline (previously a one-element Series used inside a lambda).
TotalTemp1833 = CALandTemp_20.loc[CALandTemp_20.Year == 1833, 'LandAvgTemp'].iloc[0]
CALandTemp_20['% change since 1833'] = (
    (CALandTemp_20.LandAvgTemp - TotalTemp1833) / TotalTemp1833 * 100
)
display(CALandTemp_20.head())

# Bars (secondary axis): % change since 1833; line (primary axis): temperature.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Bar(x=CALandTemp_20['Year'], y=CALandTemp_20['% change since 1833'], opacity=0.5,
           marker_line_color='rgb(8,200,30)', marker_line_width=2, name="Change since 1833 (%)",
           yaxis="y2"))
fig.add_trace(
    go.Scatter(x=CALandTemp_20['Year'], y=CALandTemp_20['LandAvgTemp'],
               name="Average Land Temperature (degrees Celsius)"))
# Title now matches the 1833-2012 range the data is sliced to.
fig.update_layout(title_text="Average Land Temperature in Canada: 1833 - 2012")
fig.update_xaxes(title_text="Year", type='category')
fig.update_yaxes(title_text="Average Land Temperature (degrees Celsius)", secondary_y=False)
fig.update_yaxes(title_text="Change since 1833 (%)", secondary_y=True, autorange="reversed")
fig.show()
# Yearly mean land temperature per country for 1852-2012 and its % change
# relative to each country's own 1852 value, for a fixed list of countries.
LTbyC = pd.read_csv('GlobalLandTemperaturesByCountry.csv')
display(LTbyC.head())
LTbyC['dt'] = pd.to_datetime(LTbyC['dt'])
yseries = LTbyC['dt'].dt.year
mseries = LTbyC['dt'].dt.month
LTbyC = pd.DataFrame({'Year': yseries, 'Month': mseries, 'AverageTemperature': LTbyC['AverageTemperature'],
                      'Country': LTbyC['Country']})
group_d = LTbyC.groupby(['Year', 'Country', 'Month'])
# mean() keeps a missing month as NaN; the previous sum() turned it into 0.0,
# which was then averaged in as a real reading.
# (The name LTbyC_sum is kept; it holds the monthly values for 1852-2012.)
LTbyC_sum = group_d[['AverageTemperature']].mean().loc[1852:2012]
# groupby(level=...).mean() replaces the removed Series.mean(level=...).
GLandTemp = (
    LTbyC_sum['AverageTemperature']
    .groupby(level=['Year', 'Country'])
    .mean()
    .rename('LandAvgTemp')
    .reset_index()
)
AvgTemp1852 = GLandTemp.loc[GLandTemp.Year == 1852].set_index('Country')

# Countries compared in the interactive chart. Indexing with the full list
# raises KeyError if any of them has no 1852 row, as the per-country lookups did.
_COUNTRIES_1852 = ['Canada', 'China', 'Denmark', 'Germany', 'India', 'Brazil', 'Iran', 'Japan',
                   'Luxembourg', 'Mexico', 'Russia', 'Saudi Arabia', 'South Korea', 'Switzerland',
                   'United States']
# Country -> 1852 yearly mean temperature.
_BASELINE_1852 = AvgTemp1852.loc[_COUNTRIES_1852, 'LandAvgTemp'].to_dict()


def assign1852ByCountry(Country):
    """Return the 1852 yearly mean temperature for *Country*.

    Returns None when the country is not one of the compared countries.
    """
    return _BASELINE_1852.get(Country)


# Countries outside the list get NaN here and are removed by dropna().
GLandTemp['LandAvgTemp1852'] = GLandTemp.Country.map(_BASELINE_1852)
GLandTemp = GLandTemp.dropna()
GLandTemp = GLandTemp.assign(
    percent_change_since_1852=lambda x: ((x['LandAvgTemp'] - x['LandAvgTemp1852']) / x['LandAvgTemp1852'] * 100))
display(GLandTemp)
# Interactive bar chart: % temperature change since 1852 for a chosen country.
bar = go.Bar()
fig_b = go.FigureWidget(data=bar)
fig_b.update_yaxes(range=[-70, 50])


# A list passed to interact() will yield a drop-down interactor
@interact(Country=['Brazil', 'Canada', 'China', 'Denmark', 'Germany', 'India', 'Iran', 'Japan', 'Luxembourg',
                   'Mexico', 'Russia', 'Saudi Arabia', 'South Korea', 'Switzerland', 'United States'])
def update_bar(Country='Germany'):
    """Redraw fig_b with the rows of GLandTemp that belong to *Country*."""
    # set_index returns a new frame, so the filtered slice is never modified in place.
    data = GLandTemp.loc[GLandTemp.Country == Country].set_index('Year')
    # The trace name used to read "Change since 2005 (%)", copied from the GHG
    # chart; the plotted column is the change relative to 1852.
    fig_b.update_traces(x=pd.Series(data.index.values), y=data.percent_change_since_1852, opacity=0.4,
                        marker_line_color='rgb(8,50,107)', marker_line_width=2, name="Change since 1852 (%)")
    fig_b.update_layout(title_text="Average Land Temperature by Country: {0}".format(Country))
    fig_b.update_xaxes(title_text="Year", type='category')
    fig_b.update_yaxes(title_text="change_since_1852 (%)")


# Explicit display so the widget is shown wherever this statement sits in a cell.
display(fig_b)
To address guiding questions 4 and 5, the following data wrangling was conducted:
- Total GHG emission (kton CO2 equivalent) by year, and by state and year.
- Change in GHG emission since 2005 (%) by year, and by state and year.
- Facility group columns derived from FacilityCode (string) in the original data, to investigate the impact of facility business sectors.
# Read "GHG emission data in Canada" csv file
GHG = pd.read_csv("./PDGES-GHGRP-GHGEmissionsGES-2004-Present.csv", encoding="ISO-8859-1", engine='python')

# Keep the columns of interest and give them short names. The key order of the
# mapping is the column order of the trimmed table.
_ghg_short_names = {
    "Reference Year / Année de référence": "YEAR",
    "Facility Name / Nom de l'installation": "FacilityName",
    "Facility City or District or Municipality / Ville ou District ou Municipalité de l'installation": "FacilityCity",
    "Facility Province or Territory / Province ou territoire de l'installation": "State",
    "Latitude": "Latitude",
    "Longitude": "Longitude",
    "Total Emissions (tonnes CO2e) / Émissions totales (tonnes éq. CO2)": "TotalEmission",
    "English Facility NAICS Code Description / Description du code SCIAN de l'installation en anglais": "FacilityCode",
}
GHG = GHG[list(_ghg_short_names)].rename(columns=_ghg_short_names)

# Print "<column>: <dtype>" for every column.
for _name, _dtype in GHG.dtypes.items():
    print(_name + ": " + str(_dtype))

display(GHG.head())
display(GHG.tail())
display(GHG.describe())
Here is the recorded annual GHG data for each facility. GHG emission levels vary widely between facilities.
# Violin plot (with an inner box plot) of facility emissions, one violin per
# reporting year; every GHG column is exposed in the hover tooltip.
_hover_columns = GHG.columns
fig = px.violin(GHG, x="YEAR", y="TotalEmission", color="YEAR", box=True, hover_data=_hover_columns)
fig.show()
TotalEmission data are available for each facility that meets the reporting requirement, i.e. that emits 10 kilotonnes or more of GHGs per year, in carbon dioxide (CO2) equivalent (eq.) units. The Government of Canada lowered the reporting threshold from 50 kt to 10 kt in 2017, so for an apples-to-apples comparison over the years, facilities with emissions below 50 kt were excluded from this trend analysis.
# Facilities reporting at least 50 kt (50,000 tonnes) in 2005 or later.
# Both conditions are evaluated on GHG and combined before indexing; the year
# filter used to be a mask built from GHG applied to the already-filtered
# GHGrev, which only worked through implicit index alignment.
_at_least_50kt = GHG.TotalEmission >= 50000
_since_2005 = GHG.YEAR >= 2005
GHGrev = GHG.loc[_at_least_50kt & _since_2005]

fig = px.histogram(GHGrev, x="YEAR", color="State")
fig.update_layout(title='Fig.1 Facilitiy Counts by province :Total Emission Gas >= 50 kt')
fig.update_xaxes(type='category', autorange="reversed")
fig.show()
# Compute total GHG emission and % change since 2005, by year.
# numeric_only=True: the facility name/city/code columns are strings and must
# not take part in the yearly sum.
AnnualGHG = GHGrev.groupby(['YEAR']).sum(numeric_only=True).reset_index()
# Scalar 2005 total (previously a one-element Series used inside a lambda).
TotalGHG2005 = AnnualGHG.loc[AnnualGHG.YEAR == 2005, 'TotalEmission'].iloc[0]
AnnualGHG['% of change since 2005'] = (AnnualGHG.TotalEmission - TotalGHG2005) / TotalGHG2005 * 100
display(AnnualGHG)
# Compute total GHG emission and % change since 2005, by state and year.
# numeric_only=True keeps the string columns out of the sum.
AnnualGHGState = GHGrev.groupby(['State', 'YEAR']).sum(numeric_only=True).reset_index()
TotalGHG2005 = AnnualGHGState.loc[AnnualGHGState.YEAR == 2005].set_index('State')

# State -> total 2005 emission, for every state that has a 2005 row. States
# without one are simply absent, so no per-state variable or elif branch is
# needed and a missing state can no longer raise KeyError here.
_BASELINE_2005_BY_STATE = TotalGHG2005['TotalEmission'].to_dict()


def Assign2005ByState(State):
    """Return the total 2005 emission of *State*, or None if it has no 2005 row."""
    return _BASELINE_2005_BY_STATE.get(State)


# States without a 2005 baseline get NaN, and therefore a NaN percentage.
AnnualGHGState['TotalEmission2005'] = AnnualGHGState.State.map(_BASELINE_2005_BY_STATE)
AnnualGHGState = AnnualGHGState.assign(
    Percent_ChangeSince2005=lambda x: ((x['TotalEmission'] - x['TotalEmission2005']) / x['TotalEmission2005'] * 100))
AnnualGHGState.set_index('State', inplace=True)
display(AnnualGHGState.head())
# Facility Group columns based on FacilityCode (string) from the original data,
# to investigate the impact of facility business sectors.
# (This sentence used to be fused onto the assignment below, which made the
# line invalid Python.)
GHGindustry = GHGrev.groupby(['State', 'FacilityCode', 'YEAR']).sum(numeric_only=True).reset_index()
GHGindustry['FacilityCode'] = GHGindustry['FacilityCode'].str.lower()

# Classify by keyword. np.select takes the first matching condition, which is
# the same priority as the former nested where() calls:
# oil/gas, then manufacturing, then mining, otherwise "Others".
# np is used directly because the pd.np alias no longer exists in pandas.
_code = GHGindustry['FacilityCode']
GHGindustry['FacilityGroup'] = np.select(
    [
        _code.str.contains("oil", na=False) | _code.str.contains("gas", na=False),
        _code.str.contains("manufacturing", na=False),
        _code.str.contains("mining", na=False),
    ],
    ["Oil and Gas", "Manufacturing", "Mining"],
    default="Others",
)
display(GHGindustry)

# Yearly totals per facility group (numeric columns only).
GHGindistryGroup = GHGindustry.groupby(['FacilityGroup', 'YEAR']).sum(numeric_only=True).reset_index()
display(GHGindistryGroup.head())
# Read the "global temperature by State" csv file and inspect it.
GlobalTempState = pd.read_csv("./GlobalLandTemperaturesByState.csv")
for _preview in (GlobalTempState.head(), GlobalTempState.tail(), GlobalTempState.describe()):
    display(_preview)

GlobalTempState['dt'] = pd.to_datetime(GlobalTempState['dt'])
# Print "<column>: <dtype>" for every column.
for _name, _dtype in GlobalTempState.dtypes.items():
    print(_name + ": " + str(_dtype))

# Add YEAR and MONTH columns derived from the 'dt' timestamps.
_timestamps = GlobalTempState['dt'].dt
GlobalTempState['YEAR'] = _timestamps.year
GlobalTempState['MONTH'] = _timestamps.month

# Only the temperature data for Canada is used for this task.
_is_canada = GlobalTempState.Country == "Canada"
GlobalTempState = GlobalTempState.loc[_is_canada]
# Remove redundant columns.
GlobalTempState = GlobalTempState.drop(columns=['dt', 'AverageTemperatureUncertainty'])
def season(Month):
    """Map a month number to its season name.

    3-5 -> "spring", 6-8 -> "summer", 9-11 -> "autumn", 12/1/2 -> "winter".
    Any other value yields None.
    """
    if 3 <= Month <= 5:
        return "spring"
    if 6 <= Month <= 8:
        return "summer"
    if 9 <= Month <= 11:
        return "autumn"
    if Month in (12, 1, 2):
        return "winter"
    return None
GlobalTempState['Season'] = GlobalTempState['MONTH'].apply(season)
display(GlobalTempState)

# Create table by Year, Season, Month and State.
grouped = GlobalTempState.groupby(['YEAR', 'Season', 'MONTH', 'State'])
# Only the temperature column is aggregated: the remaining Country column is a
# string and cannot take part in the row-wise mean that used to follow.
# mean() also keeps a missing month as NaN where sum() turned it into 0.0.
TempCanada = grouped[['AverageTemperature']].mean()
TempCanada = TempCanada.loc[2004:2012]
display(TempCanada.head())
display(TempCanada.tail())

# Compute annual average temperature and seasonal temperature for each province.
# groupby(level=...).mean() replaces the removed Series.mean(level=...).
_monthly_temperature = TempCanada['AverageTemperature']
annualTemp = _monthly_temperature.groupby(level='YEAR').mean().rename('AvgTemp').reset_index()
display(annualTemp)
seasonTempState = (
    _monthly_temperature.groupby(level=['YEAR', 'Season', 'State']).mean().rename('AvgTemp').reset_index()
)
display(seasonTempState.head())
5. How has the Greenhouse Gas (GHG) emission measured at each facility decreased or increased in Canada over the years?
# Fig. 2: total emission per year (line, primary axis) with the % change
# since 2005 (bars, secondary axis).
# The redundant `fig = go.Figure()` that was immediately overwritten is gone.
# TotalEmission comes from the "Total Emissions (tonnes CO2e)" source column
# and is never rescaled, so the labels now say tonnes instead of kton.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Bar(x=AnnualGHG['YEAR'], y=AnnualGHG['% of change since 2005'], opacity=0.4,
           marker_line_color='rgb(8,50,107)',
           marker_line_width=2, name="Change since 2005 (%)", yaxis="y2"
           ))
fig.add_trace(
    go.Scatter(x=AnnualGHG['YEAR'], y=AnnualGHG['TotalEmission'], name="Total GHG Emission (tonnes CO2e)"
               ))
fig.update_layout(
    title_text="Fig.2 Total GHG Emission in Canada over years"
)
fig.update_xaxes(title_text="Year", type='category')
fig.update_yaxes(title_text="Total GHG Emission (tonnes CO2e)", secondary_y=False)
fig.update_yaxes(title_text="Change since 2005 (%)", secondary_y=True, autorange="reversed")
fig.show()
5-1. Is there any difference in the total GHG trend over the years between provinces in Canada?
# Interactive per-province charts: % change since 2005 (bar) and total
# emission (line), both redrawn when a province is picked from the drop-down.
bar = go.Bar()
fig_bar = go.FigureWidget(data=bar)
fig_bar.update_yaxes(range=[-50, 90])
scatter = go.Scatter()
fig_scatter = go.FigureWidget(data=scatter)


# A list passed to interact() will yield a drop-down interactor
@interact(State=['Alberta', 'British Columbia', 'Newfoundland and Labrador', 'Prince Edward Island', 'Nova Scotia',
                 'New Brunswick', 'Quebec', 'Ontario', 'Manitoba', 'Saskatchewan', 'Northwest Territories'])
def update_bar(State='Northwest Territories'):
    """Redraw fig_bar and fig_scatter with the AnnualGHGState rows of *State*."""
    # AnnualGHGState is indexed by State when this cell is defined, but a later
    # cell resets that index; accept both layouts so the widget keeps working.
    if 'State' in AnnualGHGState.columns:
        table = AnnualGHGState
    else:
        table = AnnualGHGState.reset_index()
    # Boolean filtering always yields a DataFrame (even for a single row), and
    # set_index returns a new frame instead of modifying a slice in place.
    data = table.loc[table['State'] == State].set_index('YEAR')
    years = pd.Series(data.index.values)

    fig_bar.update_traces(x=years,
                          y=data.Percent_ChangeSince2005, opacity=0.4, marker_line_color='rgb(8,50,107)',
                          marker_line_width=2, name="Change since 2005 (%)")
    fig_bar.update_layout(title_text="Fig. 3.1 % Change since 2005 over Years: {0}".format(State))
    fig_bar.update_xaxes(title_text="Year", type='category')
    fig_bar.update_yaxes(title_text="Change since 2005 (%)")

    # TotalEmission is in tonnes CO2e (source column unit, never rescaled).
    fig_scatter.update_traces(x=years,
                              y=data.TotalEmission, line=dict(color='red', width=2),
                              name="Total GHG Emission (tonnes CO2e)")
    fig_scatter.update_layout(title_text="Fig. 3.2 Total GHG Emission (tonnes CO2e) over Years : {0}".format(State))
    fig_scatter.update_xaxes(title_text="Year", type='category')
    fig_scatter.update_yaxes(title_text="Total GHG Emission (tonnes CO2e)")


# Explicit display: a bare expression is only rendered when it is the last
# statement of a notebook cell, so fig_bar could be silently skipped.
display(fig_bar)
display(fig_scatter)
Figure 4 is the spatial mapping of the 2018 GHG emissions reported to Environment and Climate Change Canada, which clearly shows that the highest GHG producers are concentrated in Alberta.
# 2018 facilities with known coordinates, for the map below.
# The year mask is built from GHGrev itself (it used to come from GHG and
# relied on implicit index alignment), and the chained calls return new frames
# instead of running dropna(inplace=True) on a slice of GHGrev.
GHG2018 = (
    GHGrev.loc[GHGrev.YEAR == 2018]
    .dropna(subset=['Latitude', 'Longitude'])
    .drop(columns=['YEAR'])
)
def colorbin(GHG):
    """Return the legend class for an emission value.

    Classes are left-closed: [0, 100000), [100000, 500000), [500000, 1000000),
    [1000000, 2000000) and 2000000 or more. Values that are not >= 0 (negative
    numbers, NaN) yield None.
    """
    if not GHG >= 0:
        return None
    if GHG < 100000:
        return "Below100k"
    if GHG < 500000:
        return "100k to < 500k"
    if GHG < 1000000:
        return "500k to < 1000k"
    if GHG < 2000000:
        return "1000K to < 2000k"
    return "Over 2000k"
# Legend class of every 2018 facility.
GHG2018['2018TotalEmission_kton'] = GHG2018['TotalEmission'].apply(colorbin)

# Marker colour per legend class.
_class_colors = {
    'Below100k': 'blue',
    '100k to < 500k': 'green',
    '500k to < 1000k': 'yellow',
    '1000K to < 2000k': 'orange',
    'Over 2000k': 'red',
}
# Legend order, largest emitters first.
_class_order = {
    "2018TotalEmission_kton": [
        "Over 2000k",
        "1000K to < 2000k",
        "500k to < 1000k",
        "100k to < 500k",
        "Below100k",
    ]
}

# One marker per facility, coloured by class and sized by total emission.
fig = px.scatter_geo(
    data_frame=GHG2018,
    lat="Latitude",
    lon="Longitude",
    color="2018TotalEmission_kton",
    color_discrete_map=_class_colors,
    category_orders=_class_order,
    opacity=0.8,
    title='Fig.4 2018 Facility Greenhouse Gas Emission Reported to Environment and Climate Change Canada',
    scope='north america',
    width=1100,
    height=700,
    size='TotalEmission',
)

# Base map styling: conic conformal projection rotated to 100W, with a
# 5-degree grid over longitudes -140..-55 and latitudes 20..60.
_projection = {'type': 'conic conformal', 'rotation_lon': -100}
_lon_axis = {'showgrid': True, 'gridwidth': 0.5, 'range': [-140.0, -55.0], 'dtick': 5}
_lat_axis = {'showgrid': True, 'gridwidth': 0.5, 'range': [20.0, 60.0], 'dtick': 5}
fig.update_geos(
    visible=False,
    scope='north america',
    showland=True,
    landcolor="rgb(212, 212, 212)",
    subunitcolor="rgb(255, 255, 255)",
    countrycolor="rgb(255, 255, 255)",
    showlakes=True,
    lakecolor="rgb(255, 255, 255)",
    showsubunits=True,
    showcountries=True,
    resolution=50,
    projection=_projection,
    lonaxis=_lon_axis,
    lataxis=_lat_axis,
)
fig.show()
5-2. Why is there a difference in total GHG emission between provinces? Does the business sector have an impact on the GHG emission level?
# Fig. 5: yearly total emission per facility group.
fig = px.line(GHGindistryGroup, x='YEAR', y='TotalEmission', color='FacilityGroup')
fig.update_traces(mode='lines+markers')
fig.update_layout(title='Fig.5 Total GHG Emission by Main Facility Sectors: Oil and Gas, Mining, Manufacturing, Others')
fig.show()

# Fig. 6: 2018 emission broken down by state, facility group and facility code.
# The year mask must come from GHGindustry itself. It used to be GHG.YEAR == 2018,
# and because GHGindustry has its own fresh index after reset_index(), that mask
# was aligned by row number and picked rows unrelated to 2018.
GHG2018 = GHGindustry.loc[GHGindustry.YEAR == 2018].dropna(
    subset=['FacilityCode', 'TotalEmission', 'FacilityGroup'])
fig = px.sunburst(GHG2018, path=['State', 'FacilityGroup', 'FacilityCode'],
                  values='TotalEmission',
                  color='TotalEmission',
                  color_continuous_scale='balance')
# TotalEmission is in tonnes CO2e (source column unit, never rescaled).
fig.update_layout(title='Fig. 6 2018 Total GHG Emission (tonnes CO2e) by State and Main Facility Sectors')
fig.show()

# Fig. 7: yearly total emission per province. State is moved back from the
# index to a column so it can be used for the colour grouping.
AnnualGHGState = AnnualGHGState.reset_index()
fig = px.line(AnnualGHGState, x='YEAR', y='TotalEmission', color='State')
fig.update_traces(mode='lines+markers')
fig.update_layout(title='Fig. 7 Total GHG Emission in Canada over years by province')
fig.show()
6. Is there any relationship between GHG emission and average surface temperature in Canada?
# Fig. 8.1: yearly average temperature vs. total GHG emission for the years
# present in both tables.
mergeTable = pd.merge(AnnualGHG, annualTemp, on=['YEAR'])
# The redundant `fig = go.Figure()` that was immediately overwritten is gone.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x=mergeTable['YEAR'], y=mergeTable['AvgTemp'], opacity=0.4,
               marker_line_color='rgb(8,50,107)',
               marker_line_width=2, name="Average Temperature (degC)", yaxis="y2"
               ))
# Both x and y now come from mergeTable. y used to be AnnualGHG['TotalEmission'],
# which covers more years than the merged table, so x and y had different lengths.
# TotalEmission is in tonnes CO2e (source column unit, never rescaled).
fig.add_trace(
    go.Scatter(x=mergeTable['YEAR'], y=mergeTable['TotalEmission'], name="Total GHG Emission (tonnes CO2e)"
               ))
fig.update_layout(
    title_text="Fig. 8.1 Average Temperature and Total GHG Emission in Canada over years"
)
fig.update_xaxes(title_text="Year", type='category')
fig.update_yaxes(title_text="Total GHG Emission (tonnes CO2e)", secondary_y=False)
fig.update_yaxes(title_text="Average Temperature (degC)", secondary_y=True)
fig.show()

# Fig. 8.2: summer average temperature vs. total emission, one marker per
# province and year.
mergeTableState = pd.merge(AnnualGHGState, seasonTempState, on=['YEAR', 'State'])
test = mergeTableState.query("Season == 'summer'")
fig = px.scatter(test, x="AvgTemp", y="TotalEmission", color="State", size="TotalEmission", hover_data=None,
                 title="Fig. 8.2 Average Temperature of Summer and Total GHG Emission by Procince")
fig.show()
We succeeded in addressing the guiding questions using Python tools (Plotly Express, NumPy) through data wrangling and visualization.
1. Global Climate Change: Vital Signs of the Planet, 2020: https://climate.nasa.gov/evidence/
2. Greenhouse gas sources and sinks: executive summary 2020: https://www.canada.ca/en/environment-climate-change/services/climate-change/greenhouse-gas-emissions/sources-sinks-executive-summary-2020.html
3. The dataset link: https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data
4. Greenhouse Gas Emissions Program: https://www.canada.ca/en/environment-climate-change/services/climate-change/greenhouse-gas-emissions.html
5. The dataset link: https://open.canada.ca/data/en/dataset/a8ba14b7-7f23-462a-bdbb-83b0ef629823